In [1]:
import pandas as pd
df = pd.read_csv("C:\\Users\\pkeer\\Downloads\\archive (12)\\traffic.csv") 
In [2]:
df.head()
Out[2]:
event date country city artist album track isrc linkid
0 click 2021-08-21 Saudi Arabia Jeddah Tesher Jalebi Baby Jalebi Baby QZNWQ2070741 2d896d31-97b6-4869-967b-1c5fb9cd4bb8
1 click 2021-08-21 Saudi Arabia Jeddah Tesher Jalebi Baby Jalebi Baby QZNWQ2070741 2d896d31-97b6-4869-967b-1c5fb9cd4bb8
2 click 2021-08-21 India Ludhiana Reyanna Maria So Pretty So Pretty USUM72100871 23199824-9cf5-4b98-942a-34965c3b0cc2
3 click 2021-08-21 France Unknown Simone & Simaria, Sebastian Yatra No Llores Más No Llores Más BRUM72003904 35573248-4e49-47c7-af80-08a960fa74cd
4 click 2021-08-21 Maldives Malé Tesher Jalebi Baby Jalebi Baby QZNWQ2070741 2d896d31-97b6-4869-967b-1c5fb9cd4bb8
In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226278 entries, 0 to 226277
Data columns (total 9 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   event    226278 non-null  object
 1   date     226278 non-null  object
 2   country  226267 non-null  object
 3   city     226267 non-null  object
 4   artist   226241 non-null  object
 5   album    226273 non-null  object
 6   track    226273 non-null  object
 7   isrc     219157 non-null  object
 8   linkid   226278 non-null  object
dtypes: object(9)
memory usage: 15.5+ MB
In [4]:
df.shape
Out[4]:
(226278, 9)
In [5]:
df['date'] = pd.to_datetime(df['date'], errors='coerce')
In [6]:
df = df.dropna(subset=['country', 'city', 'artist', 'album', 'track'])
In [7]:
df = df.drop_duplicates()
In [8]:
print(df.isnull().sum())
event         0
date          0
country       0
city          0
artist        0
album         0
track         0
isrc       6277
linkid        0
dtype: int64
In [9]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 122534 entries, 0 to 226274
Data columns (total 9 columns):
 #   Column   Non-Null Count   Dtype         
---  ------   --------------   -----         
 0   event    122534 non-null  object        
 1   date     122534 non-null  datetime64[ns]
 2   country  122534 non-null  object        
 3   city     122534 non-null  object        
 4   artist   122534 non-null  object        
 5   album    122534 non-null  object        
 6   track    122534 non-null  object        
 7   isrc     116257 non-null  object        
 8   linkid   122534 non-null  object        
dtypes: datetime64[ns](1), object(8)
memory usage: 9.3+ MB
In [10]:
event_counts = df['event'].value_counts()
print("Event Counts:\n", event_counts)
Event Counts:
 event
pageview    73338
click       32489
preview     16707
Name: count, dtype: int64
In [11]:
top_countries = df['country'].value_counts().head(10)
print("Top Countries:\n", top_countries)
Top Countries:
 country
United States     28640
India             18689
France            10565
Saudi Arabia       7682
United Kingdom     5095
Germany            4015
Canada             2784
Pakistan           2633
Iraq               2444
Turkey             2399
Name: count, dtype: int64
In [12]:
top_artists = df['artist'].value_counts().head(10)
print("Top Artists:\n", top_artists)
Top Artists:
 artist
Tesher                                                          8288
Anne-Marie                                                      4029
Tundra Beats                                                    3951
Roddy Ricch                                                     3107
Olivia Rodrigo                                                  3037
Surf Mesa, Emilee                                               2956
DMNDS, Strange Fruits Music, Fallen Roses, Lujavo, Nito-Onna    2865
Reyanna Maria                                                   2672
PinkPantheress                                                  2446
50 Cent, Olivia                                                 2390
Name: count, dtype: int64
In [13]:
top_tracks = df['track'].value_counts().head(10)
print("Top Tracks:\n", top_tracks)
Top Tracks:
 track
Jalebi Baby                              8288
Beautiful                                4037
Beautiful Day                            3951
Late At Night                            3059
ily (i love you baby) (feat. Emilee)     2956
Calabria (feat. Lujavo & Nito-Onna)      2865
So Pretty                                2827
Candy Shop                               2397
Summer of Love (Shawn Mendes & Tainy)    2108
Build a Bitch                            2072
Name: count, dtype: int64
In [14]:
daily_activity = df['date'].value_counts().sort_index()
print("Daily Activity:\n", daily_activity.head())
Daily Activity:
 date
2021-08-19    21143
2021-08-20    18522
2021-08-21    16701
2021-08-22    16927
2021-08-23    16412
Name: count, dtype: int64
In [15]:
df.groupby('linkid')['event'].count().describe()

# Number of unique event types per session (complexity of user flow)
df.groupby('linkid')['event'].nunique().value_counts()
Out[15]:
event
2    1811
1    1547
3     464
Name: count, dtype: int64
In [16]:
df['country'].value_counts().head(10)
df['city'].value_counts().head(10)
Out[16]:
city
Unknown        8790
Jeddah         2497
Riyadh         2232
Hyderabad      1088
Dammam         1002
Delhi           884
Jaipur          849
Lucknow         837
Kuwait City     816
Ahmedabad       808
Name: count, dtype: int64
In [17]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")
plt.figure(figsize=(16, 20))
plt.subplot(3, 2, 1)
sns.barplot(x=event_counts.index, y=event_counts.values, palette="Set2")
plt.title("Event Distribution")
Out[17]:
Text(0.5, 1.0, 'Event Distribution')
No description has been provided for this image
In [18]:
plt.subplot(1, 2, 2)
sns.barplot(x=top_countries.values, y=top_countries.index, palette="Blues_d")
plt.title("Top 10 Countries")
Out[18]:
Text(0.5, 1.0, 'Top 10 Countries')
No description has been provided for this image
In [19]:
# Top Artists
plt.subplot(2, 2, 3)
sns.barplot(x=top_artists.values, y=top_artists.index, palette="Purples_d")
plt.title("Top 10 Artists")
Out[19]:
Text(0.5, 1.0, 'Top 10 Artists')
No description has been provided for this image
In [20]:
# Top Tracks
plt.subplot(2, 2, 4)
sns.barplot(x=top_tracks.values, y=top_tracks.index, palette="Greens_d")
plt.title("Top 10 Tracks")
Out[20]:
Text(0.5, 1.0, 'Top 10 Tracks')
No description has been provided for this image
In [21]:
# Daily Activity Line Plot
plt.subplot(3, 1, 3)
sns.lineplot(x=daily_activity.index, y=daily_activity.values, marker="o")
plt.title("Daily User Activity")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
C:\Users\pkeer\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\pkeer\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
No description has been provided for this image
In [22]:
# Daily event counts
daily_traffic = df['date'].dt.date.value_counts().sort_index()
plt.figure(figsize=(14, 5))
sns.lineplot(x=daily_traffic.index, y=daily_traffic.values, marker='o')
plt.title("Daily Website Traffic")
plt.xlabel("Date")
plt.ylabel("Number of Events")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
C:\Users\pkeer\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\pkeer\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
No description has been provided for this image
In [23]:
df['day_of_week'] = df['date'].dt.day_name()

# Count by day of week
dow_traffic = df['day_of_week'].value_counts().reindex([
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'
])

# Plot
plt.figure(figsize=(8, 5))
sns.barplot(x=dow_traffic.index, y=dow_traffic.values, palette="viridis")
plt.title("Traffic by Day of the Week")
plt.ylabel("Number of Events")
plt.xlabel("Day of Week")
plt.show()
No description has been provided for this image
In [24]:
df.set_index('date', inplace=True)

# Weekly trend
weekly_traffic = df.resample('W').size()

plt.figure(figsize=(14, 5))
sns.lineplot(x=weekly_traffic.index, y=weekly_traffic.values, marker="o")
plt.title("Weekly Website Traffic")
plt.ylabel("Events per Week")
plt.xlabel("Week")
plt.show()

# Optional: Monthly
monthly_traffic = df.resample('M').size()
C:\Users\pkeer\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\pkeer\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
No description has been provided for this image
In [25]:
print(df.columns.tolist())
['event', 'country', 'city', 'artist', 'album', 'track', 'isrc', 'linkid', 'day_of_week']
In [26]:
import plotly.graph_objects as go
df['event'] = df['event'].str.lower()
session_events = df.groupby('linkid')['event'].apply(set)
stage1 = session_events.apply(lambda x: 'pageview' in x).sum()
stage2 = session_events.apply(lambda x: {'pageview', 'preview'}.issubset(x)).sum()
stage3 = session_events.apply(lambda x: {'pageview', 'preview', 'click'}.issubset(x)).sum()

# Visualize funnel
funnel_values = [stage1, stage2, stage3]
funnel_labels = ['Pageview', 'Preview', 'Click']

fig = go.Figure(go.Funnel(
    y=funnel_labels,
    x=funnel_values,
    textinfo="value+percent initial+percent previous",
    marker=dict(color=["#636EFA", "#00CC96", "#EF553B"])
))
fig.update_layout(title="Conversion Funnel")
fig.show()
In [27]:
conversion_rate = stage3 / stage1 * 100 if stage1 else 0
dropoff_preview = (stage1 - stage2) / stage1 * 100 if stage1 else 0
dropoff_click = (stage2 - stage3) / stage2 * 100 if stage2 else 0

print(f"Drop-off after Preview: {dropoff_preview:.2f}%")
print(f"Drop-off before Click: {dropoff_click:.2f}%")
print(f"Final Conversion Rate: {conversion_rate:.2f}% (Pageview → Click)")
Drop-off after Preview: 87.04%
Drop-off before Click: 6.26%
Final Conversion Rate: 12.15% (Pageview → Click)
In [28]:
df['event'] = df['event'].str.lower()

# Create a country vs event count pivot
country_event_pivot = df.pivot_table(index='country', columns='event', values='linkid', aggfunc='count', fill_value=0)

# Calculate click-through rate (CTR)
country_event_pivot['click_rate'] = (country_event_pivot['click'] / country_event_pivot['pageview']) * 100

# Top 10 countries by click rate (with enough data)
top_ctr = country_event_pivot[country_event_pivot['pageview'] > 50].sort_values('click_rate', ascending=False).head(10)

plt.figure(figsize=(10, 5))
sns.barplot(x=top_ctr['click_rate'], y=top_ctr.index, palette="coolwarm")
plt.title("Top 10 Countries by Click-Through Rate (CTR)")
plt.xlabel("Click-Through Rate (%)")
plt.ylabel("Country")
plt.show()
No description has been provided for this image
In [29]:
city_event_pivot = df.pivot_table(index='city', columns='event', values='linkid', aggfunc='count', fill_value=0)
city_event_pivot['click_rate'] = (city_event_pivot['click'] / city_event_pivot['pageview']) * 100
top_city_ctr = city_event_pivot[city_event_pivot['pageview'] > 50].sort_values('click_rate', ascending=False).head(10)

plt.figure(figsize=(10, 5))
sns.barplot(x=top_city_ctr['click_rate'], y=top_city_ctr.index, palette="plasma")
plt.title("Top Cities by Click-Through Rate (CTR)")
plt.xlabel("Click-Through Rate (%)")
plt.ylabel("City")
plt.show()
No description has been provided for this image
In [30]:
artist_event_pivot = df.pivot_table(index='artist', columns='event', values='linkid', aggfunc='count', fill_value=0)
artist_event_pivot['conversion_rate'] = (artist_event_pivot['click'] / artist_event_pivot['pageview']) * 100

top_converting_artists = artist_event_pivot[artist_event_pivot['pageview'] > 50].sort_values('conversion_rate', ascending=False).head(10)

plt.figure(figsize=(10, 5))
sns.barplot(x=top_converting_artists['conversion_rate'], y=top_converting_artists.index, palette="light:#5A9")
plt.title("Top 10 Artists by Conversion Rate (Click/Pageview)")
plt.xlabel("Conversion Rate (%)")
plt.ylabel("Artist")
plt.show()
No description has been provided for this image
In [31]:
country_name = 'India'
top_tracks_in_country = df[df['country'] == country_name]['track'].value_counts().head(10)

plt.figure(figsize=(10, 5))
sns.barplot(x=top_tracks_in_country.values, y=top_tracks_in_country.index, palette='rocket')
plt.title(f"Top 10 Tracks in {country_name}")
plt.xlabel("Number of Events")
plt.ylabel("Track")
plt.show()
No description has been provided for this image
In [ ]: